notebook.community



In [1]:

    
%load_ext autoreload
%autoreload 2
%matplotlib inline

import pandas as pd
import os
import sys, os
sys.path.insert(0, os.path.abspath('..'))
import data_generation.diff_utils
import data_generation.mwdiff.mwdiffs_to_tsv
import numpy as np



In [2]:

    
df_raw = pd.read_csv("../../data/toxicity_annotations/raw/toxicity_for_ellery.csv")



In [3]:

    
df = df_raw.copy()



In [4]:

    
df.shape









    Out[4]:





(1671721, 28)

Clean Annotations



In [5]:

    
df['query'].value_counts()









    Out[5]:





user_blocked       504821
user_random        504800
article_blocked    331055
article_random     331045
Name: query, dtype: int64



In [6]:

    
df['ns'] = df['query'].apply(lambda x: x.split('_')[0])
df['sample'] = df['query'].apply(lambda x: x.split('_')[1])

Make random and blocked samples disjoint



In [7]:

    
df.index = df.rev_id
df.sample_count = df.drop_duplicates(subset=['rev_id', 'sample'])['rev_id'].value_counts()
print(df.sample_count.value_counts())
# just set them all to random
df['sample'][df.sample_count == 2] = 'random'
del df.sample_count
print(df.drop_duplicates(subset=['rev_id', 'sample'])['rev_id'].value_counts().value_counts())









    



1    166415
2       215
Name: rev_id, dtype: int64






    



/Users/ellerywulczyn/miniconda3/lib/python3.5/site-packages/ipykernel/__main__.py:5: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy






    



1    166630
Name: rev_id, dtype: int64

Binarize toxicity



In [8]:

    
df['toxicity'] = (df['toxicity_score'] < 0).apply(int)



In [9]:

    
df['toxicity_score'].value_counts(dropna=False)









    Out[9]:





 0.0    812717
 1.0    572471
-1.0    200512
-2.0     45825
 2.0     23123
NaN      17073
Name: toxicity_score, dtype: int64



In [10]:

    
df['toxicity'].value_counts(dropna=False)









    Out[10]:





0    1425384
1     246337
Name: toxicity, dtype: int64

Remove answers to test questions



In [11]:

    
df = df.query('_golden == False')
print('# annotations: ', df.shape[0])









    



# annotations:  1671721

Remove annotations where revision could not be read



In [12]:

    
from baselines import remove_na
# remove all annotations for a revisions where more than 50% of annotators for that revision could not read the comment
df = remove_na(df)
print('# annotations: ', df.shape[0])









    



# annotations:  1657460



In [13]:

    
# remove all annotations where the annotator could not read the comment
df = df.query('na==False')
print('# annotations: ', df.shape[0])









    



# annotations:  1651459

Make sure that each rev was only annotated by the same worker once



In [14]:

    
df.groupby(['rev_id', '_worker_id']).size().value_counts()









    Out[14]:





1    1651443
2          8
dtype: int64



In [15]:

    
df = df.drop_duplicates(subset = ['rev_id', '_worker_id'])
print('# annotations: ', df.shape[0])









    



# annotations:  1651451

Filter out annotations for revisions with duplicated diff content



In [16]:

    
comments = df.drop_duplicates(subset = ['rev_id'])
print(comments.shape[0])



In [17]:

    
u_comments = comments.drop_duplicates(subset = ['comment_text'])
print(u_comments.shape[0])



In [18]:

    
df = df.merge(u_comments[['rev_id']], how = 'inner', on = 'rev_id')
print('# annotations: ', df.shape[0])









    



# annotations:  1605360

Check that labels are not None



In [19]:

    
df['toxicity_score'].value_counts(dropna=False)









    Out[19]:





 0.0    792657
 1.0    558147
-1.0    190434
-2.0     42075
 2.0     22047
Name: toxicity_score, dtype: int64



In [20]:

    
df['toxicity'].value_counts(dropna=False)









    Out[20]:





0    1372851
1     232509
Name: toxicity, dtype: int64

Remove annotations from all revisions that were annotated less than 8 times



In [21]:

    
counts = df['rev_id'].value_counts().to_frame()
counts.columns = ['n']
counts['rev_id'] = counts.index



In [22]:

    
counts['n'].value_counts().head()









    Out[22]:





10    154088
9       2635
11      2063
8        665
7        336
Name: n, dtype: int64



In [23]:

    
counts_enough = counts.query("n>=8")



In [24]:

    
df = df.merge(counts_enough[['rev_id']], how = 'inner', on = 'rev_id')
print('# annotations: ', df.shape[0])









    



# annotations:  1601838

Get set of labeled comments



In [25]:

    
df_comments = df.drop_duplicates(subset = ['rev_id']).copy()
df_comments['logged_in'] = df_comments['user_id'].notnull()
df_comments['year'] = pd.to_datetime(df_comments['rev_timestamp']).apply(lambda x: x.year)

Add Splits



In [26]:

    
elements = np.array(["train", "dev", "test"])
probabilities = np.array([0.6, 0.2, 0.2])
df_comments['split'] = np.random.choice(elements, size=df_comments.shape[0], p=list(probabilities))



In [27]:

    
df_comments['split'].value_counts()









    Out[27]:





train    96334
dev      31869
test     31854
Name: split, dtype: int64

rename workers



In [28]:

    
df_workers = df[['_worker_id']].drop_duplicates()
df_workers['anon_id'] = range(df_workers.shape[0])
df = df.merge(df_workers, how = 'inner', on = '_worker_id')
df.shape

# save worker id mapping
df_workers.to_csv(os.path.join( "../../data/figshare", 'toxicity_annotations_worker_id_map.tsv'), sep = '\t', index = False)



In [169]:

    
# fix legacy special token issues

df_comments['diff'] = df_comments['diff'].apply(data_generation.mwdiff.mwdiffs_to_tsv.replace_special_chars)
df_comments['diff'] = df_comments['diff'].apply(lambda x: x.replace('TAB', 'TAB_TOKEN'))
df_comments['diff'] = df_comments['diff'].apply(lambda x: x.replace('NEWLINE', 'NEWLINE_TOKEN'))
df_comments['diff'] = df_comments['diff'].apply(lambda x: x.replace('"', '`'))

# apply latest version of clean and filter
df_comments = data_generation.diff_utils.clean_and_filter(df_comments)
# clean and filter drops some comments, so drop associated labels
df = df.merge(df_comments[['rev_id']], how = 'inner', on = 'rev_id' )



In [170]:

    
# rename some columns
df_comments = df_comments.rename(columns={
                        'clean_diff': 'comment',
                        'rev_timestamp': 'timestamp',
        
                       })
order = ['rev_id', 'comment', 'year', 'logged_in', 'ns', 'sample', 'split']
df_comments = df_comments[order]
df_comments = df_comments.sort_values('rev_id')
df_comments.shape









    Out[170]:





(159686, 7)



In [171]:

    
# get set of human labels

df_toxicity_labels = df[['rev_id', 'anon_id', 'toxicity', 'toxicity_score']]

df_toxicity_labels = df_toxicity_labels.rename(columns={
                        'anon_id': 'worker_id',
                       })

df_toxicity_labels = df_toxicity_labels.sort_values('rev_id')



In [172]:

    
# save dfs
df_comments.to_csv(os.path.join( "../../data/figshare", 'toxicity_annotated_comments.tsv'), sep = '\t', index = False)
df_toxicity_labels.to_csv(os.path.join( "../../data/figshare", 'toxicity_annotations.tsv'), sep = '\t', index = False)



In [173]:

    
pd.read_csv(os.path.join( "../../data/figshare", 'toxicity_annotated_comments.tsv'), sep = '\t').shape









    Out[173]:





(159686, 7)



In [174]:

    
pd.read_csv(os.path.join( "../../data/figshare", 'toxicity_annotations.tsv'), sep = '\t').drop_duplicates(subset = 'rev_id').shape









    Out[174]:





(159686, 4)



In [176]:

    
df_comments.head()









    Out[176]:






  
    
      
      rev_id
      comment
      year
      logged_in
      ns
      sample
      split
    
  
  
    
      1315373
      2232.0
      This:NEWLINE_TOKEN:One can make an analogy in ...
      2002
      True
      article
      random
      train
    
    
      223073
      4216.0
      `NEWLINE_TOKENNEWLINE_TOKEN:Clarification for ...
      2002
      True
      user
      random
      train
    
    
      480113
      8953.0
      Elected or Electoral? JHK
      2002
      False
      article
      random
      test
    
    
      1099396
      26547.0
      `This is such a fun entry.   DevotchkaNEWLINE_...
      2002
      True
      article
      random
      train
    
    
      941623
      28959.0
      Please relate the ozone hole to increases in c...
      2002
      True
      article
      random
      test



In [ ]:

	rev_id	comment	year	logged_in	ns	sample	split
1315373	2232.0	This:NEWLINE_TOKEN:One can make an analogy in ...	2002	True	article	random	train
223073	4216.0	`NEWLINE_TOKENNEWLINE_TOKEN:Clarification for ...	2002	True	user	random	train
480113	8953.0	Elected or Electoral? JHK	2002	False	article	random	test
1099396	26547.0	`This is such a fun entry. DevotchkaNEWLINE_...	2002	True	article	random	train
941623	28959.0	Please relate the ozone hole to increases in c...	2002	True	article	random	test